In [45]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
%matplotlib inline

In [46]:
#Create 2 binomial populations, pop1 (10,02,10000) and pop1(10,0.5,10000)

pop1 = np.random.binomial(10, 0.2, 10000)
pop2 = np.random.binomial(10,0.5, 10000) 
plt.hist(pop1, alpha=0.5, label='Population 1') 
plt.hist(pop2, alpha=0.5, label='Population 2') 
plt.legend(loc='upper right') 
plt.show()


print(pop1.mean())
print(pop2.mean())
print(pop1.std())
print(pop2.std())


1.9807
4.9955
1.25854181893
1.59244458302

In [47]:
#Create two samples, one per population of 100 datapoints each

sample1 = np.random.choice(pop1, 100, replace=True)
sample2 = np.random.choice(pop2, 100, replace=True)

plt.hist(sample1, alpha=0.5, label='sample 1') 
plt.hist(sample2, alpha=0.5, label='sample 2') 
plt.legend(loc='upper right') 
plt.show()



In [48]:
#Calculate mean and standard deviations for each sample

print(sample1.mean())
print(sample2.mean())
print(sample1.std())
print(sample2.std())


2.05
4.98
1.19478031453
1.68511127229

In [49]:
#Compare samples Calculate t-value & p-value.

from scipy.stats import ttest_ind
from scipy.stats import ttest_1samp

mean1 = pop1.mean()
mean2 = pop2.mean()

print(ttest_1samp(sample1, mean1))
print(ttest_1samp(sample2, mean2))

print(ttest_ind(sample1, sample2, equal_var=False))


Ttest_1sampResult(statistic=0.57711554628629025, pvalue=0.56517148293785713)
Ttest_1sampResult(statistic=-0.091520990505217559, pvalue=0.92726355958658224)
Ttest_indResult(statistic=-14.112982639104573, pvalue=7.3319062702570705e-31)

In [50]:
#Increase the size of your samples from 100 to 1000
#Calculate the means and standard deviations for your new samples and create histograms for each.

sample3 = np.random.choice(pop1, 1000, replace=True)
sample4 = np.random.choice(pop2, 1000, replace=True)

print(sample3.mean())
print(sample4.mean())
print(sample3.std())
print(sample4.std())


plt.hist(sample3, alpha=0.5, label='sample 3') 
plt.hist(sample4, alpha=0.5, label='sample 4') 
plt.legend(loc='upper right') 
plt.show()


1.986
5.049
1.20656703088
1.56926702635

In [51]:
#Compare samples Calculate t-value & p-value.

from scipy.stats import ttest_ind
from scipy.stats import ttest_1samp

mean1 = pop1.mean()
mean2 = pop2.mean()

print(ttest_1samp(sample3, mean1))
print(ttest_1samp(sample4, mean2))

print(ttest_ind(sample4, sample3, equal_var=False))


Ttest_1sampResult(statistic=0.13883761977862311, pvalue=0.88960644717886805)
Ttest_1sampResult(statistic=1.0775555714455496, pvalue=0.28149226535637206)
Ttest_indResult(statistic=48.907469276540127, pvalue=0.0)

In [52]:
#Decreasing the size of your samples to 20. What values change, and what remain the same?

sample3 = np.random.choice(pop1, 20, replace=True)
sample4 = np.random.choice(pop2, 20, replace=True)

print(sample3.mean())
print(sample4.mean())
print(sample3.std())
print(sample4.std())

plt.hist(sample3, alpha=0.5, label='sample 3') 
plt.hist(sample4, alpha=0.5, label='sample 4') 
plt.legend(loc='upper right') 
plt.show()


1.95
5.25
1.02347447452
2.04633819297

In [53]:
#Compare samples: calculate t-value & p-value.

from scipy.stats import ttest_ind
from scipy.stats import ttest_1samp

mean1 = pop1.mean()
mean2 = pop2.mean()

print(ttest_1samp(sample3, mean1))
print(ttest_1samp(sample4, mean2))

print(ttest_ind(sample4, sample3, equal_var=False))


Ttest_1sampResult(statistic=-0.1307489350226092, pvalue=0.89734837284072033)
Ttest_1sampResult(statistic=0.542109698652528, pvalue=0.59404458242754909)
Ttest_indResult(statistic=6.2868398501043101, pvalue=8.5607569230615865e-07)

What values change, and what remain the same?

When running the one sample t test, p is always higher than 0.05, meaning that at 5% significane level the sample provides sufficient evidence to conclude that the mean of the sample is the calculated mean in all cases (for both populations)

Regarding the samples: means change, the lower the number of datapoints the less accurate the representation of the population.

T-values increase with the size while p-values tend to zero showing that the difference in means is due to the difference in the populations and not just variability.


In [54]:
#Change the population value for pop1 to 0.3

pop1 = np.random.binomial(10, 0.3, 10000)
pop2 = np.random.binomial(10,0.5, 10000) 
plt.hist(pop1, alpha=0.5, label='Population 1') 
plt.hist(pop2, alpha=0.5, label='Population 2') 
plt.legend(loc='upper right') 
plt.show()

print(pop1.mean())
print(pop2.mean())
print(pop1.std())
print(pop2.std())


2.976
5.0377
1.46219834496
1.58634129682

In [55]:
#Samples of the new pop1 and pop2

sample5 = np.random.choice(pop1, 100, replace=True)
sample6 = np.random.choice(pop2, 100, replace=True)

print(sample5.mean())
print(sample6.mean())
print(sample5.std())
print(sample6.std())


plt.hist(sample5, alpha=0.5, label='sample 5') 
plt.hist(sample6, alpha=0.5, label='sample 6') 
plt.legend(loc='upper right') 
plt.show()


2.96
5.21
1.58063278468
1.77929761423

In [56]:
#Compare samples Calculate t-value & p-value.

from scipy.stats import ttest_ind
from scipy.stats import ttest_1samp

mean1 = pop1.mean()
mean2 = pop2.mean()

print(ttest_1samp(sample5, mean1))
print(ttest_1samp(sample6, mean2))

print(ttest_ind(sample6, sample5, equal_var=False))


Ttest_1sampResult(statistic=-0.10071788430571899, pvalue=0.91997810533545221)
Ttest_1sampResult(statistic=0.96350567798509745, pvalue=0.3376414166929137)
Ttest_indResult(statistic=9.4064768700972401, pvalue=1.4376893052899673e-17)

In [57]:
#Then change the population value p for group 1 to 0.4, and do it again

pop1 = np.random.binomial(10, 0.4, 10000)
pop2 = np.random.binomial(10,0.5, 10000) 
plt.hist(pop1, alpha=0.5, label='Population 1') 
plt.hist(pop2, alpha=0.5, label='Population 2') 
plt.legend(loc='upper right') 
plt.show()

print(pop1.mean())
print(pop2.mean())
print(pop1.std())
print(pop2.std())


3.9967
4.9723
1.54028864503
1.57446267342

In [58]:
#Samples of the new pop1 and pop2

sample5 = np.random.choice(pop1, 100, replace=True)
sample6 = np.random.choice(pop2, 100, replace=True)

print(sample5.mean())
print(sample6.mean())
print(sample5.std())
print(sample6.std())


plt.hist(sample5, alpha=0.5, label='sample 5') 
plt.hist(sample6, alpha=0.5, label='sample 6') 
plt.legend(loc='upper right') 
plt.show()


3.97
4.9
1.59031443432
1.59059737206

In [59]:
#Compare samples Calculate t-value & p-value.

from scipy.stats import ttest_ind
from scipy.stats import ttest_1samp

mean1 = pop1.mean()
mean2 = pop2.mean()

print(ttest_1samp(sample5, mean1))
print(ttest_1samp(sample6, mean2))

print(ttest_ind(sample6, sample5, equal_var=False))


Ttest_1sampResult(statistic=-0.16704976071062602, pvalue=0.86767163156208615)
Ttest_1sampResult(statistic=-0.45226776408979114, pvalue=0.6520655370997569)
Ttest_indResult(statistic=4.1139964622425618, pvalue=5.7004154432148999e-05)

What changes, and why?

The t-value decreases in the second case (when p1=0.4, p2=0.5) and the p-value is much higher. The t value decreases because the difference between means is lower and the increase if the p-value shows the that the noise due to variability is growing in weach case


In [69]:
#Change the distribution of your populations from binomial to a distribution of your choice

pop3 = np.random.standard_t(25, 10000)
pop4 = logistic = np.random.logistic(9,2, 10000)
plt.hist(pop1, alpha=0.5, label='Population 3') 
plt.hist(pop2, alpha=0.5, label='Population 4') 
plt.legend(loc='upper right') 
plt.show()

print(pop1.mean())
print(pop2.mean())
print(pop1.std())
print(pop2.std())


3.9967
4.9723
1.54028864503
1.57446267342

In [70]:
#Samples of the new pop1 and pop2.

sample7 = np.random.choice(pop3, 100, replace=True)
sample8 = np.random.choice(pop4, 100, replace=True)

print(sample7.mean())
print(sample8.mean())
print(sample7.std())
print(sample8.std())


plt.hist(sample7, alpha=0.5, label='sample 7') 
plt.hist(sample8, alpha=0.5, label='sample 8') 
plt.legend(loc='upper right') 
plt.show()


-0.0693556002833
8.44423594223
1.10046169507
3.69634286444

In [71]:
#Compare samples Calculate t-value & p-value.

from scipy.stats import ttest_ind
from scipy.stats import ttest_1samp

mean1 = pop3.mean()
mean2 = pop4.mean()

print(ttest_1samp(sample7, mean1))
print(ttest_1samp(sample8, mean2))

print(ttest_ind(sample8, sample7, equal_var=False))


Ttest_1sampResult(statistic=-0.63469053407407361, pvalue=0.52709432370300191)
Ttest_1sampResult(statistic=-1.45490650414748, pvalue=0.14885859701357648)
Ttest_indResult(statistic=21.96428183557293, pvalue=3.2444282398910242e-43)

Do the sample mean values still accurately represent the population values?

In this case, the distance between means is 21 times the standard error. Additionally, the p-value is close to zero therefore the difference we see is due to the difference between populations.